# Load the Lending Club loan data and keep only rows without missing values.
library(tidyr)
# stringsAsFactors = TRUE is spelled out because the rest of the script relies
# on the text columns being factors (e.g. plot(lend$term) only draws a bar
# chart for a factor); since R 4.0 read.csv() would otherwise return character
# columns.
lend <- read.csv("lending_club_loan.csv", header = TRUE,
                 stringsAsFactors = TRUE)
# drop_na() removes every row with an NA in any column. Empty strings are not
# NA, so blank emp_title / emp_length / title values survive as a "" level.
lend <- drop_na(lend)
# Rows and columns remaining after the filter.
dim(lend)
## [1] 358014 27
# List the column names of the cleaned data frame.
names(lend)
## [1] "loan_amnt" "term" "int_rate"
## [4] "installment" "grade" "sub_grade"
## [7] "emp_title" "emp_length" "home_ownership"
## [10] "annual_inc" "verification_status" "issue_d"
## [13] "loan_status" "purpose" "title"
## [16] "dti" "earliest_cr_line" "open_acc"
## [19] "pub_rec" "revol_bal" "revol_util"
## [22] "total_acc" "initial_list_status" "application_type"
## [25] "mort_acc" "pub_rec_bankruptcies" "address"
# Preview the first six rows.
head(lend, n = 6L)
## loan_amnt term int_rate installment grade sub_grade
## 1 10000 36 months 11.44 329.48 B B4
## 2 8000 36 months 11.99 265.68 B B5
## 3 15600 36 months 10.49 506.97 B B3
## 4 7200 36 months 6.49 220.65 A A2
## 5 24375 60 months 17.27 609.33 C C5
## 6 20000 36 months 13.33 677.07 C C3
## emp_title emp_length home_ownership annual_inc
## 1 Marketing 10+ years RENT 117000
## 2 Credit analyst 4 years MORTGAGE 65000
## 3 Statistician < 1 year RENT 43057
## 4 Client Advocate 6 years RENT 54000
## 5 Destiny Management Inc. 9 years MORTGAGE 55000
## 6 HR Specialist 10+ years MORTGAGE 86788
## verification_status issue_d loan_status purpose
## 1 Not Verified Jan-2015 Fully Paid vacation
## 2 Not Verified Jan-2015 Fully Paid debt_consolidation
## 3 Source Verified Jan-2015 Fully Paid credit_card
## 4 Not Verified Nov-2014 Fully Paid credit_card
## 5 Verified Apr-2013 Charged Off credit_card
## 6 Verified Sep-2015 Fully Paid debt_consolidation
## title dti earliest_cr_line open_acc pub_rec revol_bal
## 1 Vacation 26.24 Jun-1990 16 0 36369
## 2 Debt consolidation 22.05 Jul-2004 17 0 20131
## 3 Credit card refinancing 12.79 Aug-2007 13 0 11987
## 4 Credit card refinancing 2.60 Sep-2006 6 0 5472
## 5 Credit Card Refinance 33.95 Mar-1999 13 0 24584
## 6 Debt consolidation 16.31 Jan-2005 8 0 25757
## revol_util total_acc initial_list_status application_type mort_acc
## 1 41.8 25 w INDIVIDUAL 0
## 2 53.3 27 f INDIVIDUAL 3
## 3 92.2 26 f INDIVIDUAL 0
## 4 21.5 13 f INDIVIDUAL 0
## 5 69.8 43 f INDIVIDUAL 1
## 6 100.6 23 f INDIVIDUAL 4
## pub_rec_bankruptcies
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## address
## 1 0174 Michelle Gateway\nMendozaberg, OK 22690
## 2 1076 Carney Fort Apt. 347\nLoganmouth, SD 05113
## 3 87025 Mark Dale Apt. 269\nNew Sabrina, WV 05113
## 4 823 Reid Ford\nDelacruzside, MA 00813
## 5 679 Luna Roads\nGreggshire, VA 11650
## 6 1726 Cooper Passage Suite 129\nNorth Deniseberg, DE 30723
# Per-column descriptive report for every variable in lend: class, NA count and
# factor levels first, then summary statistics (numeric) or frequency tables
# (factor) for each column in turn.
library(DescTools)
Desc(lend)
## ------------------------------------------------------------------------------
## Describe lend (data.frame):
##
## data frame: 358014 obs. of 27 variables
## 358014 complete cases (100.0%)
##
## Nr ColName Class NAs Levels
## 1 loan_amnt numeric .
## 2 term factor . (2): 1- 36 months, 2- 60 months
## 3 int_rate numeric .
## 4 installment numeric .
## 5 grade factor . (7): 1-A, 2-B, 3-C, 4-D, 5-E, ...
## 6 sub_grade factor . (35): 1-A1, 2-A2, 3-A3, 4-A4,
## 5-A5, ...
## 7 emp_title factor . (173106): 1-, 2- NSA Industries
## llc, 3- Fibro Source, 4- Long
## Ilsand College Hospital, 5-
## mortgage banker, ...
## 8 emp_length factor . (12): 1-, 2-< 1 year, 3-1 year,
## 4-10+ years, 5-2 years, ...
## 9 home_ownership factor . (6): 1-ANY, 2-MORTGAGE, 3-NONE,
## 4-OTHER, 5-OWN, ...
## 10 annual_inc numeric .
## 11 verification_status factor . (3): 1-Not Verified, 2-Source
## Verified, 3-Verified
## 12 issue_d factor . (115): 1-Apr-2008, 2-Apr-2009,
## 3-Apr-2010, 4-Apr-2011,
## 5-Apr-2012, ...
## 13 loan_status factor . (2): 1-Charged Off, 2-Fully Paid
## 14 purpose factor . (14): 1-car, 2-credit_card,
## 3-debt_consolidation,
## 4-educational,
## 5-home_improvement, ...
## 15 title factor . (48818): 1-, 2- credit_card, 3-
## debt_consolidation, 4- other, 5-
## small_business, ...
## 16 dti numeric .
## 17 earliest_cr_line factor . (684): 1-Apr-1955, 2-Apr-1958,
## 3-Apr-1960, 4-Apr-1961,
## 5-Apr-1962, ...
## 18 open_acc numeric .
## 19 pub_rec numeric .
## 20 revol_bal numeric .
## 21 revol_util numeric .
## 22 total_acc numeric .
## 23 initial_list_status factor . (2): 1-f, 2-w
## 24 application_type factor . (3): 1-DIRECT_PAY, 2-INDIVIDUAL,
## 3-JOINT
## 25 mort_acc numeric .
## 26 pub_rec_bankruptcies numeric .
## 27 address factor . (393700): 1-000 Adam Station Apt.
## 329 Ashleyberg, AZ 22690, 2-000
## Adrian Cliffs Randyton, LA 22690,
## 3-000 Alexandria Street Port
## Richard, FL 22690, 4-000 Amber
## Court Lake Pamelatown, IN 00813,
## 5-000 Amy Pines Suite 498 South
## Susan, ND 22690, ...
##
##
## ------------------------------------------------------------------------------
## 1 - loan_amnt (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 1'390 0 14'386.90 14'359.43
## 100.0% 0.0% 0.0% 14'414.37
##
## .05 .10 .25 median .75 .90 .95
## 3'500.00 5'000.00 8'000.00 12'000.00 20'000.00 27'000.00 31'825.00
##
## range sd vcoef mad IQR skew kurt
## 39'000.00 8'385.10 0.58 8'154.30 12'000.00 0.75 -0.12
##
## lowest : 1'000.0 (1'155), 1'025.0 (4), 1'050.0 (6), 1'075.0 (5), 1'100.0 (34)
## highest: 39'475.0, 39'500.0, 39'600.0, 39'700.0, 40'000.0 (179)
##
## heap(?): remarkable frequency (6.9%) for the mode(s) (= 10000)
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 2 - term (factor - dichotomous)
##
## length n NAs unique
## 358'014 358'014 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 36 months 273'605 76.4% 76.3% 76.6%
## 60 months 84'409 23.6% 23.4% 23.7%
##
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------
## 3 - int_rate (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 265 0 13.80 13.79
## 100.0% 0.0% 0.0% 13.82
##
## .05 .10 .25 median .75 .90 .95
## 6.92 7.90 10.74 13.44 16.78 19.72 21.99
##
## range sd vcoef mad IQR skew kurt
## 25.67 4.50 0.33 4.61 6.04 0.41 -0.17
##
## lowest : 5.32 (2'440), 5.93 (431), 6.0 (56), 6.03 (5'675), 6.24 (1'184)
## highest: 30.79 (9), 30.84, 30.89 (3), 30.94 (3), 30.99 (13)
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 4 - installment (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 50'208 0 441.818 440.993
## 100.0% 0.0% 0.0% 442.643
##
## .05 .10 .25 median .75 .90 .95
## 118.693 166.050 260.460 385.120 580.450 794.650 939.004
##
## range sd vcoef mad IQR skew kurt
## 1'512.190 251.912 0.570 223.257 319.990 0.969 0.733
##
## lowest : 21.62, 23.61, 28.75, 28.82, 29.52
## highest: 1'464.420, 1'479.490, 1'503.850, 1'527.0, 1'533.810
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 5 - grade (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 7 7 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 B 104'416 29.2% 104'416 29.2%
## 2 C 98'353 27.5% 202'769 56.6%
## 3 D 58'558 16.4% 261'327 73.0%
## 4 A 54'255 15.2% 315'582 88.1%
## 5 E 28'871 8.1% 344'453 96.2%
## 6 F 10'792 3.0% 355'245 99.2%
## 7 G 2'769 0.8% 358'014 100.0%

## ------------------------------------------------------------------------------
## 6 - sub_grade (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 35 35 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 B3 23'768 6.6% 23'768 6.6%
## 2 B4 23'219 6.5% 46'987 13.1%
## 3 C1 21'612 6.0% 68'599 19.2%
## 4 C2 20'617 5.8% 89'216 24.9%
## 5 B2 20'491 5.7% 109'707 30.6%
## 6 C3 19'840 5.5% 129'547 36.2%
## 7 B5 19'557 5.5% 149'104 41.6%
## 8 C4 19'143 5.3% 168'247 47.0%
## 9 B1 17'381 4.9% 185'628 51.8%
## 10 C5 17'141 4.8% 202'769 56.6%
## 11 A5 15'925 4.4% 218'694 61.1%
## 12 D1 15'081 4.2% 233'775 65.3%
## ... etc.
## [list output truncated]

## ------------------------------------------------------------------------------
## 7 - emp_title (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 151'302 173'106 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 20'486 5.7% 20'486 5.7%
## 2 Teacher 4'387 1.2% 24'873 6.9%
## 3 Manager 4'249 1.2% 29'122 8.1%
## 4 Registered Nurse 1'855 0.5% 30'977 8.7%
## 5 RN 1'844 0.5% 32'821 9.2%
## 6 Supervisor 1'830 0.5% 34'651 9.7%
## 7 Sales 1'636 0.5% 36'287 10.1%
## 8 Project Manager 1'503 0.4% 37'790 10.6%
## 9 Owner 1'410 0.4% 39'200 10.9%
## 10 Driver 1'337 0.4% 40'537 11.3%
## 11 Office Manager 1'217 0.3% 41'754 11.7%
## 12 manager 1'145 0.3% 42'899 12.0%
## ... etc.
## [list output truncated]

## ------------------------------------------------------------------------------
## 8 - emp_length (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 12 12 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 10+ years 117'323 32.8% 117'323 32.8%
## 2 2 years 31'720 8.9% 149'043 41.6%
## 3 3 years 27'866 7.8% 176'909 49.4%
## 4 < 1 year 27'538 7.7% 204'447 57.1%
## 5 5 years 23'345 6.5% 227'792 63.6%
## 6 1 year 22'841 6.4% 250'633 70.0%
## 7 4 years 20'656 5.8% 271'289 75.8%
## 8 7 years 19'038 5.3% 290'327 81.1%
## 9 6 years 18'629 5.2% 308'956 86.3%
## 10 8 years 17'735 5.0% 326'691 91.3%
## 11 17'239 4.8% 343'930 96.1%
## 12 9 years 14'084 3.9% 358'014 100.0%

## ------------------------------------------------------------------------------
## 9 - home_ownership (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 6 6 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 MORTGAGE 181'592 50.7% 181'592 50.7%
## 2 RENT 141'604 39.6% 323'196 90.3%
## 3 OWN 34'752 9.7% 357'948 100.0%
## 4 OTHER 34 0.0% 357'982 100.0%
## 5 NONE 29 0.0% 358'011 100.0%
## 6 ANY 3 0.0% 358'014 100.0%

## ------------------------------------------------------------------------------
## 10 - annual_inc (numeric)
##
## length n NAs unique 0s mean'
## 358'014 358'014 0 24'723 1 74'746.46
## 100.0% 0.0% 0.0%
##
## .05 .10 .25 median .75 .90
## 28'800.00 35'000.00 45'400.75 65'000.00 90'000.00 122'000.00
##
## range sd vcoef mad IQR skew
## 8'706'582.00 61'407.27 0.82 29'652.00 44'599.25 42.14
##
## meanCI
## 74'545.31
## 74'947.61
##
## .95
## 150'000.00
##
## kurt
## 4'462.80
##
## lowest : 0.0, 600.0, 2'500.0, 4'000.0, 4'524.0
## highest: 7'000'000.0, 7'141'778.0, 7'446'395.0, 7'600'000.0, 8'706'582.0
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 11 - verification_status (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 3 3 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Verified 127'154 35.5% 127'154 35.5%
## 2 Source Verified 121'220 33.9% 248'374 69.4%
## 3 Not Verified 109'640 30.6% 358'014 100.0%

## ------------------------------------------------------------------------------
## 12 - issue_d (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 58 115 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Oct-2014 14'838 4.1% 14'838 4.1%
## 2 Jul-2014 12'597 3.5% 27'435 7.7%
## 3 Jan-2015 11'701 3.3% 39'136 10.9%
## 4 Dec-2013 10'609 3.0% 49'745 13.9%
## 5 Nov-2013 10'492 2.9% 60'237 16.8%
## 6 Jul-2015 10'260 2.9% 70'497 19.7%
## 7 Oct-2013 10'040 2.8% 80'537 22.5%
## 8 Jan-2014 9'702 2.7% 90'239 25.2%
## 9 Apr-2015 9'466 2.6% 99'705 27.8%
## 10 Sep-2013 9'172 2.6% 108'877 30.4%
## 11 Aug-2013 9'100 2.5% 117'977 33.0%
## 12 Apr-2014 9'012 2.5% 126'989 35.5%
## ... etc.
## [list output truncated]

## ------------------------------------------------------------------------------
## 13 - loan_status (factor - dichotomous)
##
## length n NAs unique
## 358'014 358'014 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## Charged Off 72'078 20.1% 20.0% 20.3%
## Fully Paid 285'936 79.9% 79.7% 80.0%
##
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------
## 14 - purpose (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 14 14 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 debt_consolidation 216'366 60.4% 216'366 60.4%
## 2 credit_card 77'681 21.7% 294'047 82.1%
## 3 home_improvement 21'327 6.0% 315'374 88.1%
## 4 other 17'542 4.9% 332'916 93.0%
## 5 major_purchase 6'838 1.9% 339'754 94.9%
## 6 small_business 3'939 1.1% 343'693 96.0%
## 7 medical 3'559 1.0% 347'252 97.0%
## 8 car 3'282 0.9% 350'534 97.9%
## 9 moving 2'343 0.7% 352'877 98.6%
## 10 vacation 2'120 0.6% 354'997 99.2%
## 11 house 1'819 0.5% 356'816 99.7%
## 12 wedding 956 0.3% 357'772 99.9%
## ... etc.
## [list output truncated]

## ------------------------------------------------------------------------------
## 15 - title (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 34'276 48'818 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Debt consolidation 152'037 42.5% 152'037 42.5%
## 2 Credit card refinancing 51'470 14.4% 203'507 56.8%
## 3 Home improvement 15'185 4.2% 218'692 61.1%
## 4 Other 12'841 3.6% 231'533 64.7%
## 5 Debt Consolidation 9'292 2.6% 240'825 67.3%
## 6 Major purchase 4'751 1.3% 245'576 68.6%
## 7 Consolidation 3'338 0.9% 248'914 69.5%
## 8 debt consolidation 3'024 0.8% 251'938 70.4%
## 9 Business 2'906 0.8% 254'844 71.2%
## 10 Medical expenses 2'729 0.8% 257'573 71.9%
## 11 Car financing 2'134 0.6% 259'707 72.5%
## 12 1'744 0.5% 261'451 73.0%
## ... etc.
## [list output truncated]

## ------------------------------------------------------------------------------
## 16 - dti (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 4'262 139 17.79 17.73
## 100.0% 0.0% 0.0% 17.85
##
## .05 .10 .25 median .75 .90 .95
## 5.04 7.26 11.63 17.29 23.49 28.96 31.97
##
## range sd vcoef mad IQR skew kurt
## 9'999.00 18.78 1.06 8.76 11.86 421.09 222'991.82
##
## lowest : 0.0 (139), 0.01 (5), 0.02 (7), 0.03 (3), 0.04 (3)
## highest: 145.65, 189.9, 380.53, 1'622.0, 9'999.0
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 17 - earliest_cr_line (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 680 684 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Aug-2001 2'680 0.7% 2'680 0.7%
## 2 Oct-2000 2'678 0.7% 5'358 1.5%
## 3 Aug-2000 2'671 0.7% 8'029 2.2%
## 4 Oct-2001 2'630 0.7% 10'659 3.0%
## 5 Aug-2002 2'437 0.7% 13'096 3.7%
## 6 Sep-2000 2'416 0.7% 15'512 4.3%
## 7 Nov-2000 2'412 0.7% 17'924 5.0%
## 8 Oct-1999 2'392 0.7% 20'316 5.7%
## 9 Oct-2002 2'392 0.7% 22'708 6.3%
## 10 Nov-1999 2'388 0.7% 25'096 7.0%
## 11 Aug-1999 2'314 0.6% 27'410 7.7%
## 12 Sep-2002 2'308 0.6% 29'718 8.3%
## ... etc.
## [list output truncated]

## ------------------------------------------------------------------------------
## 18 - open_acc (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 60 0 11.52 11.50
## 100.0% 0.0% 0.0% 11.54
##
## .05 .10 .25 median .75 .90 .95
## 5.00 6.00 8.00 11.00 14.00 18.00 21.00
##
## range sd vcoef mad IQR skew kurt
## 89.00 5.17 0.45 4.45 6.00 1.23 3.01
##
## lowest : 1.0 (71), 2.0 (879), 3.0 (3'410), 4.0 (8'559), 5.0 (15'383)
## highest: 56.0 (2), 57.0, 58.0, 76.0 (2), 90.0
##
## heap(?): remarkable frequency (9.3%) for the mode(s) (= 9)
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 19 - pub_rec (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 20 302'144 0.19 0.19
## 100.0% 0.0% 84.4% 0.19
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 0.00 0.00 0.00 1.00 1.00
##
## range sd vcoef mad IQR skew kurt
## 86.00 0.55 2.88 0.00 0.00 16.25 1'770.23
##
## lowest : 0.0 (302'144), 1.0 (47'904), 2.0 (5'431), 3.0 (1'514), 4.0 (526)
## highest: 17.0, 19.0 (2), 24.0, 40.0, 86.0
##
## heap(?): remarkable frequency (84.4%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 20 - revol_bal (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 54'104 1'091 16'088.61 16'019.73
## 100.0% 0.0% 0.3% 16'157.50
##
## .05 .10 .25 median .75 .90 .95
## 1'961.00 3'336.00 6'222.00 11'363.00 19'852.00 31'685.70 41'119.35
##
## range sd vcoef mad IQR skew kurt
## 1'743'266.00 21'029.89 1.31 9'052.76 13'630.00 12.02 389.58
##
## lowest : 0.0 (1'091), 1.0 (19), 2.0 (21), 3.0 (21), 4.0 (17)
## highest: 1'023'940.0, 1'030'826.0, 1'190'046.0, 1'298'783.0, 1'743'266.0
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 21 - revol_util (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 1'160 1'347 54.19 54.11
## 100.0% 0.0% 0.4% 54.27
##
## .05 .10 .25 median .75 .90 .95
## 12.70 21.00 36.60 55.10 72.80 86.00 91.90
##
## range sd vcoef mad IQR skew kurt
## 892.30 24.00 0.44 26.84 36.20 -0.05 3.39
##
## lowest : 0.0 (1'347), 0.1 (197), 0.2 (161), 0.3 (146), 0.4 (148)
## highest: 148.0, 150.7, 152.5, 153.0, 892.3
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 22 - total_acc (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 118 0 25.77 25.73
## 100.0% 0.0% 0.0% 25.81
##
## .05 .10 .25 median .75 .90 .95
## 9.00 12.00 17.00 24.00 33.00 42.00 48.00
##
## range sd vcoef mad IQR skew kurt
## 149.00 11.90 0.46 11.86 16.00 0.87 1.25
##
## lowest : 2.0 (16), 3.0 (172), 4.0 (869), 5.0 (1'518), 6.0 (2'316)
## highest: 124.0, 129.0, 135.0, 150.0, 151.0
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 23 - initial_list_status (factor - dichotomous)
##
## length n NAs unique
## 358'014 358'014 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## f 200'141 55.9% 55.7% 56.1%
## w 157'873 44.1% 43.9% 44.3%
##
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------
## 24 - application_type (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 3 3 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 INDIVIDUAL 357'303 99.8% 357'303 99.8%
## 2 JOINT 425 0.1% 357'728 99.9%
## 3 DIRECT_PAY 286 0.1% 358'014 100.0%

## ------------------------------------------------------------------------------
## 25 - mort_acc (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 33 139'708 1.81 1.81
## 100.0% 0.0% 39.0% 1.82
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 0.00 1.00 3.00 5.00 6.00
##
## range sd vcoef mad IQR skew kurt
## 34.00 2.15 1.18 1.48 3.00 1.60 4.48
##
## lowest : 0.0 (139'708), 1.0 (60'384), 2.0 (49'907), 3.0 (38'022), 4.0 (27'865)
## highest: 28.0, 30.0, 31.0 (2), 32.0 (2), 34.0
##
## heap(?): remarkable frequency (39.0%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 26 - pub_rec_bankruptcies (numeric)
##
## length n NAs unique 0s mean meanCI'
## 358'014 358'014 0 9 314'399 0.13 0.13
## 100.0% 0.0% 87.8% 0.13
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 0.00 0.00 0.00 1.00 1.00
##
## range sd vcoef mad IQR skew kurt
## 8.00 0.37 2.83 0.00 0.00 3.31 17.04
##
##
## level freq perc cumfreq cumperc
## 1 0 314'399 87.8% 314'399 87.8%
## 2 1 41'297 11.5% 355'696 99.4%
## 3 2 1'840 0.5% 357'536 99.9%
## 4 3 351 0.1% 357'887 100.0%
## 5 4 82 0.0% 357'969 100.0%
## 6 5 32 0.0% 358'001 100.0%
## 7 6 7 0.0% 358'008 100.0%
## 8 7 4 0.0% 358'012 100.0%
## 9 8 2 0.0% 358'014 100.0%
##
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------
## 27 - address (factor)
##
## length n NAs unique levels dupes
## 358'014 358'014 0 356'061 393'700 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 USNS Johnson\nFPO AE 05113 8 0.0% 8 0.0%
## 2 USS Johnson\nFPO AE 48052 7 0.0% 15 0.0%
## 3 USS Smith\nFPO AP 70466 7 0.0% 22 0.0%
## 4 USCGC Jones\nFPO AE 22690 6 0.0% 28 0.0%
## 5 USCGC Miller\nFPO AA 22690 6 0.0% 34 0.0%
## 6 USCGC Smith\nFPO AA 70466 6 0.0% 40 0.0%
## 7 USNS Johnson\nFPO AA 70466 6 0.0% 46 0.0%
## 8 USNS Johnson\nFPO AP 48052 6 0.0% 52 0.0%
## 9 USNV Brown\nFPO AA 48052 6 0.0% 58 0.0%
## 10 USNV Smith\nFPO AA 00813 6 0.0% 64 0.0%
## 11 USS Smith\nFPO AP 22690 6 0.0% 70 0.0%
## 12 USCGC Brown\nFPO AA 30723 5 0.0% 75 0.0%
## ... etc.
## [list output truncated]

# Base R summary of every column: min / quartiles / mean / max for numeric
# columns and counts of the most frequent levels for factor columns.
summary(lend)
## loan_amnt term int_rate installment
## Min. : 1000 36 months:273605 Min. : 5.32 Min. : 21.62
## 1st Qu.: 8000 60 months: 84409 1st Qu.:10.74 1st Qu.: 260.46
## Median :12000 Median :13.44 Median : 385.12
## Mean :14387 Mean :13.80 Mean : 441.82
## 3rd Qu.:20000 3rd Qu.:16.78 3rd Qu.: 580.45
## Max. :40000 Max. :30.99 Max. :1533.81
##
## grade sub_grade emp_title emp_length
## A: 54255 B3 : 23768 : 20486 10+ years:117323
## B:104416 B4 : 23219 Teacher : 4387 2 years : 31720
## C: 98353 C1 : 21612 Manager : 4249 3 years : 27866
## D: 58558 C2 : 20617 Registered Nurse: 1855 < 1 year : 27538
## E: 28871 B2 : 20491 RN : 1844 5 years : 23345
## F: 10792 C3 : 19840 Supervisor : 1830 1 year : 22841
## G: 2769 (Other):228467 (Other) :323363 (Other) :107381
## home_ownership annual_inc verification_status issue_d
## ANY : 3 Min. : 0 Not Verified :109640 Oct-2014: 14838
## MORTGAGE:181592 1st Qu.: 45401 Source Verified:121220 Jul-2014: 12597
## NONE : 29 Median : 65000 Verified :127154 Jan-2015: 11701
## OTHER : 34 Mean : 74746 Dec-2013: 10609
## OWN : 34752 3rd Qu.: 90000 Nov-2013: 10492
## RENT :141604 Max. :8706582 Jul-2015: 10260
## (Other) :287517
## loan_status purpose
## Charged Off: 72078 debt_consolidation:216366
## Fully Paid :285936 credit_card : 77681
## home_improvement : 21327
## other : 17542
## major_purchase : 6838
## small_business : 3939
## (Other) : 14321
## title dti earliest_cr_line
## Debt consolidation :152037 Min. : 0.00 Aug-2001: 2680
## Credit card refinancing: 51470 1st Qu.: 11.63 Oct-2000: 2678
## Home improvement : 15185 Median : 17.29 Aug-2000: 2671
## Other : 12841 Mean : 17.79 Oct-2001: 2630
## Debt Consolidation : 9292 3rd Qu.: 23.49 Aug-2002: 2437
## Major purchase : 4751 Max. :9999.00 Sep-2000: 2416
## (Other) :112438 (Other) :342502
## open_acc pub_rec revol_bal revol_util
## Min. : 1.00 Min. : 0.0000 Min. : 0 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.: 0.0000 1st Qu.: 6222 1st Qu.: 36.60
## Median :11.00 Median : 0.0000 Median : 11363 Median : 55.10
## Mean :11.52 Mean : 0.1917 Mean : 16089 Mean : 54.19
## 3rd Qu.:14.00 3rd Qu.: 0.0000 3rd Qu.: 19852 3rd Qu.: 72.80
## Max. :90.00 Max. :86.0000 Max. :1743266 Max. :892.30
##
## total_acc initial_list_status application_type mort_acc
## Min. : 2.00 f:200141 DIRECT_PAY: 286 Min. : 0.000
## 1st Qu.: 17.00 w:157873 INDIVIDUAL:357303 1st Qu.: 0.000
## Median : 24.00 JOINT : 425 Median : 1.000
## Mean : 25.77 Mean : 1.814
## 3rd Qu.: 33.00 3rd Qu.: 3.000
## Max. :151.00 Max. :34.000
##
## pub_rec_bankruptcies address
## Min. :0.0000 USNS Johnson\nFPO AE 05113: 8
## 1st Qu.:0.0000 USS Johnson\nFPO AE 48052 : 7
## Median :0.0000 USS Smith\nFPO AP 70466 : 7
## Mean :0.1302 USCGC Jones\nFPO AE 22690 : 6
## 3rd Qu.:0.0000 USCGC Miller\nFPO AA 22690: 6
## Max. :8.0000 USCGC Smith\nFPO AA 70466 : 6
## (Other) :357974
library(pastecs)
##
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
##
## extract
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the numeric columns and tabulate their descriptive statistics
# with stat.desc().
lendsub <- dplyr::select(
  lend,
  c("loan_amnt", "int_rate", "installment", "annual_inc", "dti", "open_acc",
    "pub_rec", "revol_bal", "revol_util", "total_acc", "mort_acc",
    "pub_rec_bankruptcies")
)
stat.desc(lendsub)
## loan_amnt int_rate installment annual_inc dti
## nbr.val 3.580140e+05 3.580140e+05 3.580140e+05 3.580140e+05 3.580140e+05
## nbr.null 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 1.390000e+02
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min 1.000000e+03 5.320000e+00 2.162000e+01 0.000000e+00 0.000000e+00
## max 4.000000e+04 3.099000e+01 1.533810e+03 8.706582e+06 9.999000e+03
## range 3.900000e+04 2.567000e+01 1.512190e+03 8.706582e+06 9.999000e+03
## sum 5.150712e+09 4.942115e+06 1.581771e+08 2.676028e+10 6.368604e+06
## median 1.200000e+04 1.344000e+01 3.851200e+02 6.500000e+04 1.729000e+01
## mean 1.438690e+04 1.380425e+01 4.418181e+02 7.474646e+04 1.778870e+01
## SE.mean 1.401388e+01 7.528314e-03 4.210157e-01 1.026289e+02 3.138754e-02
## CI.mean.0.95 2.746679e+01 1.475527e-02 8.251784e-01 2.011497e+02 6.151865e-02
## var 7.030995e+07 2.029063e+01 6.345949e+04 3.770853e+09 3.527073e+02
## std.dev 8.385103e+03 4.504512e+00 2.519117e+02 6.140727e+04 1.878050e+01
## coef.var 5.828289e-01 3.263134e-01 5.701705e-01 8.215409e-01 1.055755e+00
## open_acc pub_rec revol_bal revol_util total_acc
## nbr.val 3.580140e+05 3.580140e+05 3.580140e+05 3.580140e+05 3.580140e+05
## nbr.null 0.000000e+00 3.021440e+05 1.091000e+03 1.347000e+03 0.000000e+00
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.000000e+00
## max 9.000000e+01 8.600000e+01 1.743266e+06 8.923000e+02 1.510000e+02
## range 8.900000e+01 8.600000e+01 1.743266e+06 8.923000e+02 1.490000e+02
## sum 4.124980e+06 6.861900e+04 5.759949e+09 1.940195e+07 9.225534e+06
## median 1.100000e+01 0.000000e+00 1.136300e+04 5.510000e+01 2.400000e+01
## mean 1.152184e+01 1.916657e-01 1.608861e+04 5.419326e+01 2.576864e+01
## SE.mean 8.639367e-03 9.215629e-04 3.514689e+01 4.010819e-02 1.988711e-02
## CI.mean.0.95 1.693291e-02 1.806236e-03 6.888688e+01 7.861087e-02 3.897815e-02
## var 2.672169e+01 3.040535e-01 4.422561e+08 5.759252e+02 1.415935e+02
## std.dev 5.169302e+00 5.514104e-01 2.102989e+04 2.399844e+01 1.189931e+01
## coef.var 4.486525e-01 2.876939e+00 1.307129e+00 4.428307e-01 4.617747e-01
## mort_acc pub_rec_bankruptcies
## nbr.val 3.580140e+05 3.580140e+05
## nbr.null 1.397080e+05 3.143990e+05
## nbr.na 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00
## max 3.400000e+01 8.000000e+00
## range 3.400000e+01 8.000000e+00
## sum 6.493630e+05 4.660400e+04
## median 1.000000e+00 0.000000e+00
## mean 1.813792e+00 1.301737e-01
## SE.mean 3.589744e-03 6.147393e-04
## CI.mean.0.95 7.035793e-03 1.204871e-03
## var 4.613462e+00 1.352950e-01
## std.dev 2.147897e+00 3.678248e-01
## coef.var 1.184202e+00 2.825646e+00
library(ggplot2)
# Histogram of loan amounts in bins of 1000.
# lend was already passed through drop_na() right after loading, so the
# repeated call was a no-op and has been removed. qplot() is deprecated in
# current ggplot2, so the plot is built with ggplot() + geom_histogram().
ggplot(lend, aes(x = loan_amnt)) +
  geom_histogram(binwidth = 1000, fill = "lightblue", colour = "black") +
  labs(title = "Histogram for loan Amount", x = "Loan Amount")

# 2 x 2 grid of base-graphics plots: histograms of interest rate and
# installment, bar charts of loan term and loan status.
# par() returns the previous settings; they are restored at the end so that
# later base plots (e.g. the purpose pie chart) are not squeezed into this
# layout with its reduced margins.
old_par <- par(mfrow = c(2, 2), mar = rep(2, 4))
hist(lend$int_rate, ylim = c(0, 40000), col = "lightblue",
     main = "Interest Rate")
hist(lend$installment, ylim = c(0, 80000), col = "lightblue",
     main = "Installment")
# plot() on a factor draws a bar chart of the level counts.
plot(lend$term, ylim = c(0, 350000), col = "lightblue", main = "Loan Term")
plot(lend$loan_status, ylim = c(0, 350000), col = "lightblue",
     main = "Loan Status")
par(old_par)

# Four box plots sharing one visual style: blue boxes, black outlines and
# open-circle outliers. The common layer is built once and reused.
box_layer <- geom_boxplot(
  fill = "steelblue3",
  colour = "black",
  outlier.colour = "black",
  outlier.shape = 1
)

# Loan amount distribution within each credit grade.
ggplot(lend, aes(x = grade, y = loan_amnt)) +
  box_layer +
  labs(title = "Loan Amount by Grade", x = "Grade", y = "Loan Amount \n")

# Interest rate distribution within each credit grade.
ggplot(lend, aes(x = grade, y = int_rate)) +
  box_layer +
  labs(title = "Interest Rate by Grade", x = "Grade", y = "Interest Rate \n")

# Interest rate distribution by home-ownership category.
ggplot(lend, aes(x = home_ownership, y = int_rate)) +
  box_layer +
  labs(title = "Interest Rate by Home Ownership", x = "Home Ownership",
       y = "Interest Rate \n")

# Loan amount distribution for each loan term.
ggplot(lend, aes(x = term, y = loan_amnt)) +
  box_layer +
  labs(title = "Loan Amount by Term", x = "Term", y = "Loan Amount \n")

# Number of loans per stated purpose.
table(lend[["purpose"]])
##
## car credit_card debt_consolidation educational
## 3282 77681 216366 1
## home_improvement house major_purchase medical
## 21327 1819 6838 3559
## moving other renewable_energy small_business
## 2343 17542 241 3939
## vacation wedding
## 2120 956
# Pie chart of loan purpose.
# The counts are taken from the data instead of being typed in by hand: the
# previous hard-coded vector did not match table(lend$purpose) for the
# cleaned data (e.g. 4697 vs 3282 for "car", 257 vs 1 for "educational").
purpose_tab <- table(lend$purpose)
info <- as.vector(purpose_tab)
purpose_names <- names(purpose_tab)
# Use one palette for both the slices and the legend; pie() otherwise falls
# back to its own default colours, which do not match rainbow().
slice_cols <- rainbow(length(info))
pie(info, labels = purpose_names, col = slice_cols, main = "Purpose")
legend("topright", legend = purpose_names, cex = 0.5, fill = slice_cols)

library(ggcorrplot)
# Pairwise Pearson correlations between all numeric columns of lend,
# printed to two decimals.
is_num <- vapply(lend, is.numeric, logical(1))
df <- lend[, is_num, drop = FALSE]
r <- cor(df, use = "complete.obs")
round(r, digits = 2)
## loan_amnt int_rate installment annual_inc dti open_acc
## loan_amnt 1.00 0.15 0.96 0.34 0.01 0.19
## int_rate 0.15 1.00 0.14 -0.07 0.07 0.00
## installment 0.96 0.14 1.00 0.34 0.01 0.18
## annual_inc 0.34 -0.07 0.34 1.00 -0.08 0.13
## dti 0.01 0.07 0.01 -0.08 1.00 0.13
## open_acc 0.19 0.00 0.18 0.13 0.13 1.00
## pub_rec -0.09 0.05 -0.08 -0.02 -0.02 -0.03
## revol_bal 0.33 -0.02 0.31 0.30 0.06 0.21
## revol_util 0.10 0.27 0.12 0.03 0.08 -0.14
## total_acc 0.21 -0.05 0.19 0.19 0.09 0.68
## mort_acc 0.22 -0.08 0.19 0.24 -0.03 0.11
## pub_rec_bankruptcies -0.12 0.05 -0.11 -0.06 -0.02 -0.04
## pub_rec revol_bal revol_util total_acc mort_acc
## loan_amnt -0.09 0.33 0.10 0.21 0.22
## int_rate 0.05 -0.02 0.27 -0.05 -0.08
## installment -0.08 0.31 0.12 0.19 0.19
## annual_inc -0.02 0.30 0.03 0.19 0.24
## dti -0.02 0.06 0.08 0.09 -0.03
## open_acc -0.03 0.21 -0.14 0.68 0.11
## pub_rec 1.00 -0.11 -0.09 0.01 0.01
## revol_bal -0.11 1.00 0.22 0.18 0.20
## revol_util -0.09 0.22 1.00 -0.11 0.01
## total_acc 0.01 0.18 -0.11 1.00 0.38
## mort_acc 0.01 0.20 0.01 0.38 1.00
## pub_rec_bankruptcies 0.69 -0.13 -0.10 0.04 0.03
## pub_rec_bankruptcies
## loan_amnt -0.12
## int_rate 0.05
## installment -0.11
## annual_inc -0.06
## dti -0.02
## open_acc -0.04
## pub_rec 0.69
## revol_bal -0.13
## revol_util -0.10
## total_acc 0.04
## mort_acc 0.03
## pub_rec_bankruptcies 1.00
# Heat map of the correlation matrix: lower triangle only, variables ordered
# by hierarchical clustering, coefficients printed in the cells.
ggcorrplot(corr = r, type = "lower", hc.order = TRUE, lab = TRUE)

# Modelling data set: the loan outcome plus six numeric columns.
input <- lend[c(
  "loan_status",
  "loan_amnt",
  "annual_inc",
  "int_rate",
  "installment",
  "revol_bal",
  "revol_util"
)]
dim(input)
## [1] 358014 7
# Sequential 80/20 split: the first 80% of rows train the models, the rest are
# held out. For the 358014-row data set this gives rows 1..286411 and
# 286412..358014, the same split as the previously hard-coded indices.
n_train <- floor(0.8 * nrow(input))
is_train <- seq_len(nrow(input)) <= n_train
train <- input[is_train, ]
test <- input[!is_train, ]

# Logistic regression of loan_status on the six numeric predictors.
# The formula uses bare column names so that `data = train` takes effect;
# writing input$col in the formula bypassed `data` and fitted the model on
# every row, including the held-out test rows.
# NOTE: any printed output following this call that reports 358013 null
# degrees of freedom comes from that earlier full-data fit; re-run to refresh.
lend_glm <- glm(
  loan_status ~ loan_amnt + annual_inc + int_rate + installment +
    revol_bal + revol_util,
  family = binomial,
  data = train
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table, deviance and AIC for the logistic regression.
summary(lend_glm)
##
## Call:
## glm(formula = input$loan_status ~ input$loan_amnt + input$annual_inc +
## input$int_rate + input$installment + input$revol_bal + input$revol_util,
## family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4904 0.3861 0.5473 0.6909 1.6076
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.175e+00 1.900e-02 167.097 <2e-16 ***
## input$loan_amnt -6.608e-05 1.634e-06 -40.436 <2e-16 ***
## input$annual_inc 5.664e-06 1.412e-07 40.111 <2e-16 ***
## input$int_rate -1.255e-01 1.021e-03 -122.916 <2e-16 ***
## input$installment 1.713e-03 5.475e-05 31.283 <2e-16 ***
## input$revol_bal 5.856e-07 2.850e-07 2.054 0.04 *
## input$revol_util -3.144e-03 1.959e-04 -16.049 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 359616 on 358013 degrees of freedom
## Residual deviance: 334527 on 358007 degrees of freedom
## AIC: 334541
##
## Number of Fisher Scoring iterations: 5
# Linear regression of loan amount on loan status and five numeric predictors.
# Bare column names are used so the model is fitted on `train` only; the
# input$col form ignored `data = train` and used every row of input.
# NOTE: any printed output following this call that reports 358007 residual
# degrees of freedom comes from that earlier full-data fit; re-run to refresh.
lend_lm1 <- lm(
  loan_amnt ~ loan_status + annual_inc + int_rate + installment +
    revol_bal + revol_util,
  data = train
)
summary(lend_lm1)
##
## Call:
## lm(formula = input$loan_amnt ~ input$loan_status + input$annual_inc +
## input$int_rate + input$installment + input$revol_bal + input$revol_util,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31159.8 -1252.1 -576.0 295.9 13631.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.370e+02 1.961e+01 27.39 <2e-16 ***
## input$loan_statusFully Paid -4.219e+02 1.053e+01 -40.06 <2e-16 ***
## input$annual_inc 2.917e-03 7.298e-05 39.97 <2e-16 ***
## input$int_rate 3.864e+01 9.879e-01 39.12 <2e-16 ***
## input$installment 3.121e+01 1.803e-02 1730.77 <2e-16 ***
## input$revol_bal 1.315e-02 2.154e-04 61.07 <2e-16 ***
## input$revol_util -1.042e+01 1.823e-01 -57.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2445 on 358007 degrees of freedom
## Multiple R-squared: 0.9149, Adjusted R-squared: 0.9149
## F-statistic: 6.419e+05 on 6 and 358007 DF, p-value: < 2.2e-16
# Reduced linear model: loan amount explained by revolving balance, annual
# income and installment.
# Bare column names are used so the model is fitted on `train` only; the
# input$col form ignored `data = train` and used every row of input.
# NOTE: any printed output following this call that reports 358010 residual
# degrees of freedom comes from that earlier full-data fit; re-run to refresh.
lend_lm2 <- lm(
  loan_amnt ~ revol_bal + annual_inc + installment,
  data = train
)
summary(lend_lm2)
##
## Call:
## lm(formula = input$loan_amnt ~ input$revol_bal + input$annual_inc +
## input$installment, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26291.5 -1243.4 -668.1 143.1 14080.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.932e+02 8.770e+00 22.02 <2e-16 ***
## input$revol_bal 1.010e-02 2.116e-04 47.74 <2e-16 ***
## input$annual_inc 2.616e-03 7.302e-05 35.82 <2e-16 ***
## input$installment 3.132e+01 1.788e-02 1751.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2466 on 358010 degrees of freedom
## Multiple R-squared: 0.9135, Adjusted R-squared: 0.9135
## F-statistic: 1.26e+06 on 3 and 358010 DF, p-value: < 2.2e-16